**Purpose: This file evaluates and corrects missingness in the dataset created by the "1-cleaning_public_010320" do-file. Please run that do-file before running this one.

* Load the combined Pew dataset; every check and correction below operates on it.
use "pew_data_allyrs.dta", clear

*Primary Independent Variable Checks
* NOTE(review): the value codes used below (nativity 0 = US born, 1 = foreign born; birthcountry 0 = USA, 3 = outside USA; legal_status 1 = US-born citizen, 2 = naturalized, 3 = LPR) are read off the author's comments and the legal_status labels assigned later in this file. Confirm against the cleaning do-file.
tab citizen, m
tab citizen birthcountry, col m 

* Fill in missing nativity from country of birth, then resolve contradictions between the two.
tab nativity birthcountry, m // 4 born in USA, label US citizens; 7 born outside USA, label foreign born; 5 born in USA classified as foreign born, label US born
replace nativity = 1 if birthcountry==3 & nativity==.
replace nativity = 0 if birthcountry==0 & nativity==.
replace nativity = 0 if birthcountry==0 & nativity==1
tab nativity birthcountry, m //1 classified as US born but born outside USA, label US born
replace birthcountry = 0 if nativity==0 & birthcountry==3

* Fill in missing nativity for respondents whose legal status is code 1.
tab nativity legal_status, m // 4 born in USA, label US-born citizens
replace nativity = 0 if legal_status==1 & nativity==.

* Recode nativity 1 to 0 when parentborn is 0.
tab nativity parentborn,  m //27 who are foreign born but have no FB parent - change to US born
replace nativity = 0 if nativity==1 & parentborn==0

* Cross-fill citizenship and legal status from each other where one of them is missing.
tab citizen legal_status, col m //4 LPRs missing on citizenship, make noncitizens; 2 USC missing on legal status
replace citizen = 0 if citizen==. & legal_status==3
replace legal_status=1 if legal_status==. & citizen==1 & birthcountry==0
replace legal_status=2 if legal_status==. & citizen==1 & birthcountry !=0 & birthcountry !=.


* Inspect respondents with missing nativity whose age differs from years lived in the USA.
* count and list are used instead of browse so the cases are written to the Results window and the log rather than shown only in the interactive Data Browser.
count if age != yearslivedusa & nativity==.
list age yearslivedusa nativity if age != yearslivedusa & nativity==. //2 respondents missing on yearslived and nativity 

*Household Adults
tab hh_adults, m // 13 still labeled as DK/Refused
* Code 9 is treated as DK/Refused and set to missing.
replace hh_adults=. if hh_adults==9
* Top-code at 8. !missing() is used instead of != . because extended missing values (.a-.z) are also greater than 8 and are not equal to ., so they would otherwise be recoded to 8.
replace hh_adults = 8 if hh_adults >8 & !missing(hh_adults)
tab hh_adults

tab marital hh_adults, m
//125 people who live alone and for whom marital status is missing; label single
replace marital = 0 if marital==. & hh_adults==1

*Employment
* Fill missing employment with 0 when occupation is code 3.
tab employment occupation, col m // 67 people who have missing employment but unemployed under occupation
replace employment = 0 if employment==. & occupation==3

*Parental Status
* Non-parents (parent==0) with missing child18 are set to 0.
* NOTE(review): the comment below says parents with missing child18 are assumed to have a child under 18, but no command in this section applies that rule. Confirm whether a replace for parent==1 was intended.
tab parent child18, col m // 1134 respondents who we know aren't parents; 789 parents who we don't know if they have any children under 18 in hh--assume they do
replace child18 =0 if child18==. & parent==0

tab child18,m
tab parent,m

* Non-parents with missing child6 are set to 0.
tab parent child6, col m // 3735 childunder6 missing who we know aren't parents
replace child6 =0 if parent==0 & child6==.

**Missingness by category
* Builds a 0/1 missingness indicator for each covariate and cross-tabulates it with citizenship.
* Each indicator is preceded by capture drop: this do-file saves over the dataset it loads, so on a second run the indicators already exist and a bare gen would stop with "already defined".
* missing() is used instead of ==. so that extended missing values (.a-.z) are also flagged.

tab citizen, m //n = 37
tab worry2 citizen, col chi2 m // n 102; 39 (1.27%) noncitizens, 60 (1.09) us citizens; 3 missing on citizenship and worries; pr < .000; 10/37 missing on citizenship but don't worry; 24/37 missing on citizenship but worry

capture drop age_missing
gen age_missing = missing(age)
tab age_missing

tab age_missing citizen, col chi2 //not significant

tab sex, m // none missing

capture drop educ_missing
gen educ_missing = missing(educ)
tab educ_missing

tab educ_missing citizen, col chi2 // 2.96% noncitizens, 2.08% citizens p = .008

capture drop pparty_missing
gen pparty_missing = missing(pparty)
tab pparty_missing

tab pparty_missing citizen, col chi2 // 17.83% noncitizens missing, 14.47% citizen p =.000

capture drop hh_adults_missing
gen hh_adults_missing = missing(hh_adults)
tab hh_adults_missing // 670 missing

tab hh_adults_missing citizen, col chi2 // 6.60% noncitizens, 8.41% citizens. p = 0.003

tab census_region_res, m //none missing


***key variables***
* Full sample with complete data would be n = 8593.
* One-way frequency tables, missing values included, for every analysis variable.
* The trailing note on each line records the number of missing cases the author observed.

* Outcome, main predictors and core covariates
tabulate worry2, missing             // 102 missing
tabulate citizen, missing            // 37 missing
tabulate legal_status, missing       // 237 missing, 199 from 2013 (can't distinguish among noncitizens)
tabulate age, missing                // 318 missing
tabulate sex, missing                // 0 missing
tabulate educ, missing               // 209 missing
tabulate pparty, missing             // 1347 missing (255 in 2007; 192 in 2008; 778 in 2010)
tabulate registeredvoter, missing    // 147 missing
tabulate hh_adults, missing          // 670 missing
tabulate census_region_res, missing  // 0 missing
tabulate svy_year, missing           // 0 missing

* Additional covariates
* NOTE(review): the intvlang note says 2017, but the survey years mentioned elsewhere in this file are 2007, 2008, 2010, 2013, 2016 and 2018. Confirm which wave did not ask interview language.
tabulate hhincome, missing           // 2102 missing
tabulate hhincome2, missing          // 6422 missing
tabulate marital, missing            // 769 missing
tabulate parent, missing             // 1391 missing
tabulate intvlang, missing           // 1001 missing (all from 2017, not asked)
tabulate intvrace, missing           // 6578 missing (only asked in 2008)
tabulate parentborn, missing         // 2593 missing
tabulate employment, missing         // 831 missing
tabulate immsyst, missing            // 7892 missing (only asked in 2013, none missing)

*check if missingness varies by worry2, citizen, legal_status
* varmissing is a scratch 0/1 flag, rebuilt for each key variable, marking respondents who are missing on that variable.
* Each section compares the flagged and unflagged groups on the covariates (ttest for variables compared by mean, tab with chi2 for categorical ones).
* NOTE(review): only the worry2 and citizen sections are present here; there is no legal_status section.

***worry2
* capture drop replaces the hand-toggled drop: varmissing is left in the dataset that this file saves over its own input, so it may or may not exist at this point.
capture drop varmissing
* missing() also flags extended missing values (.a-.z), which ==. would not.
gen varmissing = missing(worry2)
tab citizen varmissing, col chi2 // missing worries don't vary by citizenships
ttest age, by(varmissing) // more likely to be older (53.6 v. 42.8) if missing; p < .001
ttest sex, by(varmissing) // does not vary
tab educ varmissing, col chi2 // people without HS education most likely to miss (54%; p < .01)
tab pparty varmissing, col chi2 // does not vary
tab registeredvoter varmissing, col chi2 // does not vary
ttest hh_adults, by(varmissing) // does not vary

ttest hhincome, by(varmissing) //does not vary
ttest hhincome2, by(varmissing) //does not vary
tab marital varmissing, col chi2 //more likely to miss among unmarried/unpartnered (55.7% v. 44.3%)
tab parent varmissing, col chi2 // more likely to miss among nonparents p < .05
tab intvlanguage varmissing if svy_year !=2016, col chi2 // more likely to miss among spanish speakers p < .01
tab intvrace varmissing if svy_year==2008, col chi2 //does not vary
tab census_region_res varmissing, col chi2 // more likely to miss in the south
tab svy_year varmissing, col chi2 // does not vary

***citizenship
* Rebuild the flag for missing citizenship.
drop varmissing
gen varmissing = missing(citizen)
tab worry2 varmissing, col chi2 //more likely to miss citizenship among those who worry a lot
ttest age, by(varmissing) //does not vary
ttest sex, by(varmissing) //does not vary
tab educ varmissing, col chi2 // more likely to miss among >HS graduates (55.9%)
tab pparty varmissing, col chi2 // does not vary
tab registeredvoter varmissing, col chi2 // does not vary
ttest hh_adults, by(varmissing) //does not vary

tab hhincome varmissing, col chi2 // more likely to miss among <$50K hh
tab hhincome2 varmissing, col chi2 // does not vary
tab marital varmissing, col chi2 // does not vary
tab intvlanguage varmissing, col chi2 // more likely to miss in Spanish survey (p < .001)
tab intvrace varmissing, col chi2 // does not vary
tab census_region_res varmissing, col chi2 // does not vary
tab svy_year varmissing, col chi2 // p < .001; most likely to miss in 2010 (48.9%)

*generate filters to be used in main analysis
* Each filter is 1 when the respondent has no missing value on any listed variable and 0 otherwise.
* capture drop makes the section re-runnable: this do-file saves over the dataset it loads, so on a second run the filters already exist and gen would fail.
* missing(a, b, ...) returns 1 if any argument is missing, including extended missing values (.a-.z) that a != . test would count as observed.

capture drop filter_citizens
gen filter_citizens = !missing(worry2, citizen, age, sex, educ, pparty, hh_adults, census_region_res)
sum filter_citizens
bysort svy_year: tab filter_citizens 

*Filter to use for legal_status
* Same rule as filter_citizens, with legal_status in place of citizen.

capture drop filter_legal
gen filter_legal = !missing(worry2, legal_status, age, sex, educ, pparty, hh_adults, census_region_res)
sum filter_legal
bysort svy_year: tab filter_legal

*reorder legal_status variable
* Reverse the 1-4 coding into legal_order (3 = U.S.-born citizens ... 0 = likely undocumented).
* The variable and the value label that recode creates under the same name are cleared first so a re-run starts clean.

capture drop legal_order
capture label drop legal_order
recode legal_status (1=3 "U.S.-born Citizens") (2=2 "Naturalized Citizens") (3=1 "Lawful Permanent Residents") (4=0 "Likely-Undocumented Immigrants"), gen(legal_order)
label variable legal_order "Legal Status (ordered)"
tab legal_order, m

****check if analytic samples differ from full sample
* Compares respondents inside (filter = 1) and outside (filter = 0) each analytic sample on the model variables.
* NOTE(review): the two ttest commands on the outcome use worry, while the filters and the rest of this file use worry2. Confirm whether worry is a separate variable or was meant to be worry2.

*filter_citizens

ttest worry, by(filter_citizens) // does not vary
ttest citizen, by(filter_citizens) //does not vary
ttest age, by(filter_citizens) // analytic sample is about 4 years younger (46.1 v 42.0) p < .001
ttest sex, by(filter_citizens) // analytic sample is more male p < .05
tab educ filter_citizens, col chi2 // fewer <HS, more HS, more some college, more college+
tab pparty filter_citizens, col chi2 // p<05; fewer independents, more everyone else
tab registeredvoter filter_citizens, col chi2 // more registered voters
ttest hh_adults, by(filter_citizens) // slightly larger households
tab census_region_res filter_citizens, col chi2 // does not vary
tab svy_year filter_citizens, col chi2 // more 2007, more 2008, fewer 2010, more 2013, more 2016, fewer 2018

*filter_legal

ttest worry, by(filter_legal) // does not vary
tab legal_status filter_legal, col chi2 //more us born citizens fewer undocumented
ttest age, by(filter_legal) // p < .001 about four years younger 
* The stray "if svy_year" qualifier was removed: it is true for every nonzero year, so it never restricted the sample.
ttest sex, by(filter_legal) // p < .05 more male
tab educ filter_legal, col chi2 // p < .001 fewer <HS and more everyone else
tab pparty filter_legal, col chi2 // p < .001 fewer Independents, more Dems, slightly more Repub
tab registeredvoter filter_legal, col chi2  // p < .001; more registered voters
ttest hh_adults, by(filter_legal) // does not vary
tab census_region_res filter_legal , col chi2 // does not vary
tab svy_year filter_legal  , col chi2 // p < .001 more in 2007, 2008, 2016, 2018; fewer in 2010, 2013

* Saves over the same file that was loaded at the top of this do-file, now including the corrections, indicators and filters created above.
save "pew_data_allyrs.dta", replace

***GO TO 3-analysis_010320 FILE NOW***
